In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.svm import svc
In [2]:
def getAccuracy(testSet, predictions):
	correct = 0
	for x in range(len(testSet)):
		if testSet[x]== predictions[x]:
			correct += 1
	return (correct/float(len(testSet))) * 100.0
In [3]:
vehicle_df = pd.read_csv("vehicle-1.csv", header = 1)
vehicle_df.head(10)
Out[3]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
0 95 48.0 83.0 178.0 72.0 10 162.0 42.0 20.0 159 176.0 379.0 184.0 70.0 6.0 16.0 187.0 197 van
1 91 41.0 84.0 141.0 57.0 9 149.0 45.0 19.0 143 170.0 330.0 158.0 72.0 9.0 14.0 189.0 199 van
2 104 50.0 106.0 209.0 66.0 10 207.0 32.0 23.0 158 223.0 635.0 220.0 73.0 14.0 9.0 188.0 196 car
3 93 41.0 82.0 159.0 63.0 9 144.0 46.0 19.0 143 160.0 309.0 127.0 63.0 6.0 10.0 199.0 207 van
4 85 44.0 70.0 205.0 103.0 52 149.0 45.0 19.0 144 241.0 325.0 188.0 127.0 9.0 11.0 180.0 183 bus
5 107 NaN 106.0 172.0 50.0 6 255.0 26.0 28.0 169 280.0 957.0 264.0 85.0 5.0 9.0 181.0 183 bus
6 97 43.0 73.0 173.0 65.0 6 153.0 42.0 19.0 143 176.0 361.0 172.0 66.0 13.0 1.0 200.0 204 bus
7 90 43.0 66.0 157.0 65.0 9 137.0 48.0 18.0 146 162.0 281.0 164.0 67.0 3.0 3.0 193.0 202 van
8 86 34.0 62.0 140.0 61.0 7 122.0 54.0 17.0 127 141.0 223.0 112.0 64.0 2.0 14.0 200.0 208 van
9 93 44.0 98.0 NaN 62.0 11 183.0 36.0 22.0 146 202.0 505.0 152.0 64.0 4.0 14.0 195.0 204 car
In [4]:
vehicle_df.dtypes
Out[4]:
compactness                      int64
circularity                    float64
distance_circularity           float64
radius_ratio                   float64
pr.axis_aspect_ratio           float64
max.length_aspect_ratio          int64
scatter_ratio                  float64
elongatedness                  float64
pr.axis_rectangularity         float64
max.length_rectangularity        int64
scaled_variance                float64
scaled_variance.1              float64
scaled_radius_of_gyration      float64
scaled_radius_of_gyration.1    float64
skewness_about                 float64
skewness_about.1               float64
skewness_about.2               float64
hollows_ratio                    int64
class                           object
dtype: object
In [5]:
vehicle_df.shape
Out[5]:
(846, 19)
In [6]:
vehicle_df.describe().transpose()
Out[6]:
count mean std min 25% 50% 75% max
compactness 846.0 93.678487 8.234474 73.0 87.00 93.0 100.0 119.0
circularity 841.0 44.828775 6.152172 33.0 40.00 44.0 49.0 59.0
distance_circularity 842.0 82.110451 15.778292 40.0 70.00 80.0 98.0 112.0
radius_ratio 840.0 168.888095 33.520198 104.0 141.00 167.0 195.0 333.0
pr.axis_aspect_ratio 844.0 61.678910 7.891463 47.0 57.00 61.0 65.0 138.0
max.length_aspect_ratio 846.0 8.567376 4.601217 2.0 7.00 8.0 10.0 55.0
scatter_ratio 845.0 168.901775 33.214848 112.0 147.00 157.0 198.0 265.0
elongatedness 845.0 40.933728 7.816186 26.0 33.00 43.0 46.0 61.0
pr.axis_rectangularity 843.0 20.582444 2.592933 17.0 19.00 20.0 23.0 29.0
max.length_rectangularity 846.0 147.998818 14.515652 118.0 137.00 146.0 159.0 188.0
scaled_variance 843.0 188.631079 31.411004 130.0 167.00 179.0 217.0 320.0
scaled_variance.1 844.0 439.494076 176.666903 184.0 318.00 363.5 587.0 1018.0
scaled_radius_of_gyration 844.0 174.709716 32.584808 109.0 149.00 173.5 198.0 268.0
scaled_radius_of_gyration.1 842.0 72.447743 7.486190 59.0 67.00 71.5 75.0 135.0
skewness_about 840.0 6.364286 4.920649 0.0 2.00 6.0 9.0 22.0
skewness_about.1 845.0 12.602367 8.936081 0.0 5.00 11.0 19.0 41.0
skewness_about.2 845.0 188.919527 6.155809 176.0 184.00 188.0 193.0 206.0
hollows_ratio 846.0 195.632388 7.438797 181.0 190.25 197.0 201.0 211.0

All columns except 'class' are described above. class is the target variable. Except for compactness, max.length_aspect_ratio, max.lenght_rectancgularity and hollows_ratio, all variables have missing values.

In [7]:
sns.countplot(vehicle_df['class'])
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d9f907dcf8>

We see that there are around 200 vans and buses and 400+ cars in the class variable.

Treatment of the missing values

In [8]:
vehicle_df = vehicle_df.fillna(vehicle_df.median())
In [9]:
vehicle_df.describe().transpose()
Out[9]:
count mean std min 25% 50% 75% max
compactness 846.0 93.678487 8.234474 73.0 87.00 93.0 100.00 119.0
circularity 846.0 44.823877 6.134272 33.0 40.00 44.0 49.00 59.0
distance_circularity 846.0 82.100473 15.741569 40.0 70.00 80.0 98.00 112.0
radius_ratio 846.0 168.874704 33.401356 104.0 141.00 167.0 195.00 333.0
pr.axis_aspect_ratio 846.0 61.677305 7.882188 47.0 57.00 61.0 65.00 138.0
max.length_aspect_ratio 846.0 8.567376 4.601217 2.0 7.00 8.0 10.00 55.0
scatter_ratio 846.0 168.887707 33.197710 112.0 147.00 157.0 198.00 265.0
elongatedness 846.0 40.936170 7.811882 26.0 33.00 43.0 46.00 61.0
pr.axis_rectangularity 846.0 20.580378 2.588558 17.0 19.00 20.0 23.00 29.0
max.length_rectangularity 846.0 147.998818 14.515652 118.0 137.00 146.0 159.00 188.0
scaled_variance 846.0 188.596927 31.360427 130.0 167.00 179.0 217.00 320.0
scaled_variance.1 846.0 439.314421 176.496341 184.0 318.25 363.5 586.75 1018.0
scaled_radius_of_gyration 846.0 174.706856 32.546277 109.0 149.00 173.5 198.00 268.0
scaled_radius_of_gyration.1 846.0 72.443262 7.468734 59.0 67.00 71.5 75.00 135.0
skewness_about 846.0 6.361702 4.903244 0.0 2.00 6.0 9.00 22.0
skewness_about.1 846.0 12.600473 8.930962 0.0 5.00 11.0 19.00 41.0
skewness_about.2 846.0 188.918440 6.152247 176.0 184.00 188.0 193.00 206.0
hollows_ratio 846.0 195.632388 7.438797 181.0 190.25 197.0 201.00 211.0

The missing values have been now replaced by the median of those variable. there are no missing values in the data set now.

Label encoding for Class

In [10]:
from sklearn import preprocessing
In [11]:
label_encoder = preprocessing.LabelEncoder() 
vehicle_df['class']= label_encoder.fit_transform(vehicle_df['class']) 
  
vehicle_df['class'].unique() 
Out[11]:
array([2, 1, 0], dtype=int64)
In [12]:
sns.countplot(vehicle_df['class'])
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d9f909ab00>

Van = 0, Car = 1, Bus = 2. This is the conversion of class into numeric values by using lable encoder

In [13]:
vehicle_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846 entries, 0 to 845
Data columns (total 19 columns):
compactness                    846 non-null int64
circularity                    846 non-null float64
distance_circularity           846 non-null float64
radius_ratio                   846 non-null float64
pr.axis_aspect_ratio           846 non-null float64
max.length_aspect_ratio        846 non-null int64
scatter_ratio                  846 non-null float64
elongatedness                  846 non-null float64
pr.axis_rectangularity         846 non-null float64
max.length_rectangularity      846 non-null int64
scaled_variance                846 non-null float64
scaled_variance.1              846 non-null float64
scaled_radius_of_gyration      846 non-null float64
scaled_radius_of_gyration.1    846 non-null float64
skewness_about                 846 non-null float64
skewness_about.1               846 non-null float64
skewness_about.2               846 non-null float64
hollows_ratio                  846 non-null int64
class                          846 non-null int64
dtypes: float64(14), int64(5)
memory usage: 125.7 KB

Univariate Analysis

In [14]:
sns.boxplot(vehicle_df['compactness'], fliersize=2, orient='v')
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d9f9119208>
In [15]:
Q1=vehicle_df['compactness'].quantile(q=0.25)
Q3=vehicle_df['compactness'].quantile(q=0.75)
L_outliers=Q1-1.5*(Q3-Q1)
U_outliers=Q3+1.5*(Q3-Q1)
print('Lower outliers in circularity: ', L_outliers)
print('Upper outliers in circularity: ', U_outliers)
Lower outliers in circularity:  67.5
Upper outliers in circularity:  119.5
In [16]:
sns.boxplot(vehicle_df['circularity'], fliersize=2, orient='v')
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d9f92ba908>
In [17]:
Q1=vehicle_df['circularity'].quantile(q=0.25)
Q3=vehicle_df['circularity'].quantile(q=0.75)
L_outliers=Q1-1.5*(Q3-Q1)
U_outliers=Q3+1.5*(Q3-Q1)
print('Lower outliers in circularity: ', L_outliers)
print('Upper outliers in circularity: ', U_outliers)
Lower outliers in circularity:  26.5
Upper outliers in circularity:  62.5
In [18]:
sns.boxplot(vehicle_df['distance_circularity'], fliersize=2, orient='v')
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d9f9348438>
In [19]:
Q1=vehicle_df['distance_circularity'].quantile(q=0.25)
Q3=vehicle_df['distance_circularity'].quantile(q=0.75)
L_outliers=Q1-1.5*(Q3-Q1)
U_outliers=Q3+1.5*(Q3-Q1)
print('Lower outliers in distance_circularity: ', L_outliers)
print('Upper outliers in distance_circularity: ', U_outliers)
Lower outliers in distance_circularity:  28.0
Upper outliers in distance_circularity:  140.0
In [20]:
sns.boxplot(vehicle_df['radius_ratio'], fliersize=2, orient='v')
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d9f93afd30>
In [21]:
Q1=vehicle_df['radius_ratio'].quantile(q=0.25)
Q3=vehicle_df['radius_ratio'].quantile(q=0.75)
L_outliers=Q1-1.5*(Q3-Q1)
U_outliers=Q3+1.5*(Q3-Q1)
print('Lower outliers in radius_ratio: ', L_outliers)
print('Upper outliers in radius_ratio: ', U_outliers)
Lower outliers in radius_ratio:  60.0
Upper outliers in radius_ratio:  276.0
In [22]:
sns.boxplot(vehicle_df['pr.axis_aspect_ratio'], fliersize=2, orient='v')
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d9f9636860>
In [23]:
Q1=vehicle_df['pr.axis_aspect_ratio'].quantile(q=0.25)
Q3=vehicle_df['pr.axis_aspect_ratio'].quantile(q=0.75)
L_outliers=Q1-1.5*(Q3-Q1)
U_outliers=Q3+1.5*(Q3-Q1)
print('Lower outliers in pr.axis_aspect_ratio: ', L_outliers)
print('Upper outliers in pr.axis_aspect_ratio: ', U_outliers)
Lower outliers in pr.axis_aspect_ratio:  45.0
Upper outliers in pr.axis_aspect_ratio:  77.0
In [24]:
sns.boxplot(vehicle_df['max.length_aspect_ratio'], fliersize=2, orient='v')
Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d9f968def0>
In [25]:
Q1=vehicle_df['max.length_aspect_ratio'].quantile(q=0.25)
Q3=vehicle_df['max.length_aspect_ratio'].quantile(q=0.75)
L_outliers=Q1-1.5*(Q3-Q1)
U_outliers=Q3+1.5*(Q3-Q1)
print('Lower outliers in max.length_aspect_ratio: ', L_outliers)
print('Upper outliers in max.length_aspect_ratio: ', U_outliers)
Lower outliers in max.length_aspect_ratio:  2.5
Upper outliers in max.length_aspect_ratio:  14.5
In [26]:
sns.boxplot(vehicle_df['scatter_ratio'], fliersize=2, orient='v')
Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d9f924c550>
In [27]:
Q1=vehicle_df['scatter_ratio'].quantile(q=0.25)
Q3=vehicle_df['scatter_ratio'].quantile(q=0.75)
L_outliers=Q1-1.5*(Q3-Q1)
U_outliers=Q3+1.5*(Q3-Q1)
print('Lower outliers in scatter_ratio: ', L_outliers)
print('Upper outliers in scatter_ratio: ', U_outliers)
Lower outliers in scatter_ratio:  70.5
Upper outliers in scatter_ratio:  274.5
In [28]:
sns.boxplot(vehicle_df['elongatedness'], fliersize=2, orient='v')
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d9f9720898>
In [29]:
Q1=vehicle_df['elongatedness'].quantile(q=0.25)
Q3=vehicle_df['elongatedness'].quantile(q=0.75)
L_outliers=Q1-1.5*(Q3-Q1)
U_outliers=Q3+1.5*(Q3-Q1)
print('Lower outliers in elongatedness: ', L_outliers)
print('Upper outliers in elongatedness: ', U_outliers)
Lower outliers in elongatedness:  13.5
Upper outliers in elongatedness:  65.5
In [30]:
sns.boxplot(vehicle_df['pr.axis_rectangularity'], fliersize=2, orient='v')
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d9f93af710>
In [31]:
Q1=vehicle_df['pr.axis_rectangularity'].quantile(q=0.25)
Q3=vehicle_df['pr.axis_rectangularity'].quantile(q=0.75)
L_outliers=Q1-1.5*(Q3-Q1)
U_outliers=Q3+1.5*(Q3-Q1)
print('Lower outliers in pr.axis_rectangularity: ', L_outliers)
print('Upper outliers in pr.axis_rectangularity: ', U_outliers)
Lower outliers in pr.axis_rectangularity:  13.0
Upper outliers in pr.axis_rectangularity:  29.0
In [32]:
sns.boxplot(vehicle_df['max.length_rectangularity'], fliersize=2, orient='v')
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d9f97cce48>
In [33]:
Q1=vehicle_df['max.length_rectangularity'].quantile(q=0.25)
Q3=vehicle_df['max.length_rectangularity'].quantile(q=0.75)
L_outliers=Q1-1.5*(Q3-Q1)
U_outliers=Q3+1.5*(Q3-Q1)
print('Lower outliers in max.length_rectangularity: ', L_outliers)
print('Upper outliers in max.length_rectangularity: ', U_outliers)
Lower outliers in max.length_rectangularity:  104.0
Upper outliers in max.length_rectangularity:  192.0
In [34]:
sns.boxplot(vehicle_df['scaled_variance'], fliersize=2, orient='v')
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d9f9720dd8>
In [35]:
Q1=vehicle_df['scaled_variance'].quantile(q=0.25)
Q3=vehicle_df['scaled_variance'].quantile(q=0.75)
L_outliers=Q1-1.5*(Q3-Q1)
U_outliers=Q3+1.5*(Q3-Q1)
print('Lower outliers in scaled_variance: ', L_outliers)
print('Upper outliers in scaled_variance: ', U_outliers)
Lower outliers in scaled_variance:  92.0
Upper outliers in scaled_variance:  292.0
In [36]:
sns.boxplot(vehicle_df['scaled_variance.1'], fliersize=2, orient='v')
Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d9f9119358>
In [37]:
Q1=vehicle_df['scaled_variance.1'].quantile(q=0.25)
Q3=vehicle_df['scaled_variance.1'].quantile(q=0.75)
L_outliers=Q1-1.5*(Q3-Q1)
U_outliers=Q3+1.5*(Q3-Q1)
print('Lower outliers in scaled_variance.1: ', L_outliers)
print('Upper outliers in scaled_variance.1: ', U_outliers)
Lower outliers in scaled_variance.1:  -84.5
Upper outliers in scaled_variance.1:  989.5
In [38]:
sns.boxplot(vehicle_df['scaled_radius_of_gyration'], fliersize=2, orient='v')
Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d9f98e4d68>
In [39]:
Q1=vehicle_df['scaled_radius_of_gyration'].quantile(q=0.25)
Q3=vehicle_df['scaled_radius_of_gyration'].quantile(q=0.75)
L_outliers=Q1-1.5*(Q3-Q1)
U_outliers=Q3+1.5*(Q3-Q1)
print('Lower outliers in scaled_radius_of_gyration: ', L_outliers)
print('Upper outliers in scaled_radius_of_gyration: ', U_outliers)
Lower outliers in scaled_radius_of_gyration:  75.5
Upper outliers in scaled_radius_of_gyration:  271.5
In [40]:
sns.boxplot(vehicle_df['scaled_radius_of_gyration.1'], fliersize=2, orient='v')
Out[40]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d9f982a240>
In [41]:
Q1=vehicle_df['scaled_radius_of_gyration.1'].quantile(q=0.25)
Q3=vehicle_df['scaled_radius_of_gyration.1'].quantile(q=0.75)
L_outliers=Q1-1.5*(Q3-Q1)
U_outliers=Q3+1.5*(Q3-Q1)
print('Lower outliers in scaled_radius_of_gyration.1: ', L_outliers)
print('Upper outliers in scaled_radius_of_gyration.1: ', U_outliers)
Lower outliers in scaled_radius_of_gyration.1:  55.0
Upper outliers in scaled_radius_of_gyration.1:  87.0
In [42]:
sns.boxplot(vehicle_df['skewness_about'], fliersize=2, orient='v')
Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d9f99a3d30>
In [43]:
Q1=vehicle_df['skewness_about'].quantile(q=0.25)
Q3=vehicle_df['skewness_about'].quantile(q=0.75)
L_outliers=Q1-1.5*(Q3-Q1)
U_outliers=Q3+1.5*(Q3-Q1)
print('Lower outliers in skewness_about: ', L_outliers)
print('Upper outliers in skewness_about: ', U_outliers)
Lower outliers in skewness_about:  -8.5
Upper outliers in skewness_about:  19.5
In [44]:
sns.boxplot(vehicle_df['skewness_about.1'], fliersize=2, orient='v')
Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d9f99f8b38>
In [45]:
Q1=vehicle_df['skewness_about.1'].quantile(q=0.25)
Q3=vehicle_df['skewness_about.1'].quantile(q=0.75)
L_outliers=Q1-1.5*(Q3-Q1)
U_outliers=Q3+1.5*(Q3-Q1)
print('Lower outliers in skewness_about.1: ', L_outliers)
print('Upper outliers in skewness_about.1: ', U_outliers)
Lower outliers in skewness_about.1:  -16.0
Upper outliers in skewness_about.1:  40.0
In [46]:
sns.boxplot(vehicle_df['skewness_about.2'], fliersize=2, orient='v')
Out[46]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d9f9a45b38>
In [47]:
Q1=vehicle_df['skewness_about.2'].quantile(q=0.25)
Q3=vehicle_df['skewness_about.2'].quantile(q=0.75)
L_outliers=Q1-1.5*(Q3-Q1)
U_outliers=Q3+1.5*(Q3-Q1)
print('Lower outliers in skewness_about.2: ', L_outliers)
print('Upper outliers in skewness_about.2: ', U_outliers)
Lower outliers in skewness_about.2:  170.5
Upper outliers in skewness_about.2:  206.5
In [48]:
sns.boxplot(vehicle_df['hollows_ratio'], fliersize=2, orient='v')
Out[48]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d9f9aa7b00>
In [49]:
Q1=vehicle_df['hollows_ratio'].quantile(q=0.25)
Q3=vehicle_df['hollows_ratio'].quantile(q=0.75)
L_outliers=Q1-1.5*(Q3-Q1)
U_outliers=Q3+1.5*(Q3-Q1)
print('Lower outliers in hollows_ratio: ', L_outliers)
print('Upper outliers in hollows_ratio: ', U_outliers)
Lower outliers in hollows_ratio:  174.125
Upper outliers in hollows_ratio:  217.125

Outlier Treatment

In [50]:
vehicle_df1 = vehicle_df
for col_name in vehicle_df1.columns[:-1]:
    q1 = vehicle_df1[col_name].quantile(0.25)
    q3 = vehicle_df1[col_name].quantile(0.75)
    iqr = q3 - q1
    
    low = q1-1.5*iqr
    high = q3+1.5*iqr
    vehicle_df1.loc[(vehicle_df1[col_name] < low) | (vehicle_df1[col_name] > high), col_name] = vehicle_df1[col_name].median()

Outliers have been treated using the median across the entire dataframe.

In [51]:
vehicle_df1.boxplot(figsize=(40,20))
Out[51]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d9f9b09780>

The missing values and outliers have been treated in the data. The missing values were imputed by applying median in place of missing values. Label encoding was done for the varaible named 'class'. Outlier have been treated as well

Bivariate Analysis. Visual Analysis of the data

In [52]:
sns.pairplot(vehicle_df1,hue='class', diag_kind='kde')
C:\ProgramData\Anaconda3\lib\site-packages\statsmodels\nonparametric\kde.py:488: RuntimeWarning: invalid value encountered in true_divide
  binned = fast_linbin(X, a, b, gridsize) / (delta * nobs)
C:\ProgramData\Anaconda3\lib\site-packages\statsmodels\nonparametric\kdetools.py:34: RuntimeWarning: invalid value encountered in double_scalars
  FAC1 = 2*(np.pi*bw/RANGE)**2
Out[52]:
<seaborn.axisgrid.PairGrid at 0x1d9f9c76d30>

Correlation matrix

In [53]:
cor=vehicle_df1.corr()
cor
Out[53]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
compactness 1.000000 0.684887 0.789928 0.721925 0.192864 0.499928 0.812620 -0.788750 0.813694 0.676143 0.769871 0.806170 0.585243 -0.246681 0.197308 0.156348 0.298537 0.365552 -0.033796
circularity 0.684887 1.000000 0.792320 0.638280 0.203253 0.560470 0.847938 -0.821472 0.843400 0.961318 0.802768 0.827462 0.925816 0.068745 0.136351 -0.009666 -0.104426 0.046351 -0.158910
distance_circularity 0.789928 0.792320 1.000000 0.794222 0.244332 0.666809 0.905076 -0.911307 0.893025 0.774527 0.869584 0.883943 0.705771 -0.229353 0.099107 0.262345 0.146098 0.332732 -0.064467
radius_ratio 0.721925 0.638280 0.794222 1.000000 0.650554 0.463958 0.769941 -0.825392 0.744139 0.579468 0.786183 0.760257 0.550774 -0.390459 0.035755 0.179601 0.405849 0.491758 -0.213948
pr.axis_aspect_ratio 0.192864 0.203253 0.244332 0.650554 1.000000 0.150295 0.194195 -0.298144 0.163047 0.147592 0.207101 0.196401 0.148591 -0.321070 -0.056030 -0.021088 0.400882 0.415734 -0.209298
max.length_aspect_ratio 0.499928 0.560470 0.666809 0.463958 0.150295 1.000000 0.490759 -0.504181 0.487931 0.642713 0.401391 0.463249 0.397397 -0.335444 0.081898 0.141664 0.083794 0.413174 0.352958
scatter_ratio 0.812620 0.847938 0.905076 0.769941 0.194195 0.490759 1.000000 -0.971601 0.989751 0.809083 0.960883 0.980447 0.799875 0.011314 0.064242 0.211647 0.005628 0.118817 -0.288895
elongatedness -0.788750 -0.821472 -0.911307 -0.825392 -0.298144 -0.504181 -0.971601 1.000000 -0.948996 -0.775854 -0.947644 -0.948851 -0.766314 0.078391 -0.046943 -0.183642 -0.115126 -0.216905 0.339344
pr.axis_rectangularity 0.813694 0.843400 0.893025 0.744139 0.163047 0.487931 0.989751 -0.948996 1.000000 0.810934 0.947329 0.973606 0.796690 0.027545 0.073127 0.213801 -0.018649 0.099286 -0.258481
max.length_rectangularity 0.676143 0.961318 0.774527 0.579468 0.147592 0.642713 0.809083 -0.775854 0.810934 1.000000 0.750222 0.789632 0.866450 0.053856 0.130702 0.004129 -0.103948 0.076770 -0.032399
scaled_variance 0.769871 0.802768 0.869584 0.786183 0.207101 0.401391 0.960883 -0.947644 0.947329 0.750222 1.000000 0.943780 0.785073 0.025828 0.024693 0.197122 0.015171 0.086330 -0.324062
scaled_variance.1 0.806170 0.827462 0.883943 0.760257 0.196401 0.463249 0.980447 -0.948851 0.973606 0.789632 0.943780 1.000000 0.782972 0.009386 0.065731 0.204941 0.017557 0.119642 -0.279487
scaled_radius_of_gyration 0.585243 0.925816 0.705771 0.550774 0.148591 0.397397 0.799875 -0.766314 0.796690 0.866450 0.785073 0.782972 1.000000 0.215279 0.162970 -0.055667 -0.224450 -0.118002 -0.250267
scaled_radius_of_gyration.1 -0.246681 0.068745 -0.229353 -0.390459 -0.321070 -0.335444 0.011314 0.078391 0.027545 0.053856 0.025828 0.009386 0.215279 1.000000 -0.057755 -0.123996 -0.832738 -0.901332 -0.283540
skewness_about 0.197308 0.136351 0.099107 0.035755 -0.056030 0.081898 0.064242 -0.046943 0.073127 0.130702 0.024693 0.065731 0.162970 -0.057755 1.000000 -0.041734 0.086661 0.062619 0.126720
skewness_about.1 0.156348 -0.009666 0.262345 0.179601 -0.021088 0.141664 0.211647 -0.183642 0.213801 0.004129 0.197122 0.204941 -0.055667 -0.123996 -0.041734 1.000000 0.074473 0.200651 -0.010872
skewness_about.2 0.298537 -0.104426 0.146098 0.405849 0.400882 0.083794 0.005628 -0.115126 -0.018649 -0.103948 0.015171 0.017557 -0.224450 -0.832738 0.086661 0.074473 1.000000 0.892581 0.067244
hollows_ratio 0.365552 0.046351 0.332732 0.491758 0.415734 0.413174 0.118817 -0.216905 0.099286 0.076770 0.086330 0.119642 -0.118002 -0.901332 0.062619 0.200651 0.892581 1.000000 0.235874
class -0.033796 -0.158910 -0.064467 -0.213948 -0.209298 0.352958 -0.288895 0.339344 -0.258481 -0.032399 -0.324062 -0.279487 -0.250267 -0.283540 0.126720 -0.010872 0.067244 0.235874 1.000000
In [54]:
sns.set(font_scale=1.15)
plt.figure(figsize=(20,15))

sns.heatmap(cor, vmax=.8, linewidths=0.01,
            square=True,annot=True,cmap="coolwarm",linecolor="black")
plt.title('Correlation between features');

max.length_aspect_ratio, elongatedness, skewness_about, skewness_about1, and hollows_ratio show positive correlation with target variable class.

In [55]:
vehicle_df1.shape
Out[55]:
(846, 19)
In [56]:
vehicle_df1.groupby(["class"]).count()
Out[56]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio
class
0 218 218 218 218 218 218 218 218 218 218 218 218 218 218 218 218 218 218
1 429 429 429 429 429 429 429 429 429 429 429 429 429 429 429 429 429 429
2 199 199 199 199 199 199 199 199 199 199 199 199 199 199 199 199 199 199

Class imbalance is checked for. There are more cars in the dataset than vans and buses. The number of cars in double compared to vans and buses and may have an influence on the classification.

In [57]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
array = vehicle_df1.values
X = array[:,0:18]
Y = array[:,18]
le = preprocessing.LabelEncoder()
le.fit(Y)
test_size = 0.30 # taking 70:30 training and test set
seed = 7  # Random numbmer seeding for reapeatability of the code
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
type(X_train)
Out[57]:
numpy.ndarray
In [58]:
clf = svm.SVC(gamma= 0.0025, C=3)    
In [59]:
clf.fit(X_train , y_train)
Out[59]:
SVC(C=3, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0025, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
In [60]:
y_pred = clf.predict(X_test)
In [61]:
SVM_Accuracy = getAccuracy(y_test , y_pred)   # Calling getAccuracy function instead of usinig array comparison to get the %age accuracy
SVM_Accuracy
Out[61]:
88.9763779527559

Accuracy of SVM Model is 88.98%

In [62]:
from sklearn import metrics
In [63]:
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

Applying KFold to SVM

In [64]:
vehiclekfold_df = vehicle_df1
array = vehiclekfold_df.values
X = array[:,0:18]
y = array[:,18]
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=0.50, random_state=1)
num_folds = 50
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed)
results1 = cross_val_score(clf,X, y, cv=kfold)
accuracy=np.mean(abs(results1))
print('Average accuracy: ',accuracy)
print('Standard Deviation: ',results1.std())
Average accuracy:  0.9079411764705883
Standard Deviation:  0.06901572585451025

Accuracy of SVM model with KFold validation is improved by approx 2% and is 90.80%

Applying PCA on the data

In [65]:
covMatrix = np.cov(X,rowvar=False)
print(covMatrix)
[[ 6.78065662e+01  3.45953782e+01  1.02393288e+02  1.91108904e+02
   8.91493278e+00  8.49722047e+00  2.22142552e+02 -5.07377062e+01
   1.73442164e+01  8.08185544e+01  1.96740865e+02  1.15737644e+03
   1.56845875e+02 -1.25103809e+01  7.42980962e+00  1.14292109e+01
   1.51240421e+01  2.23917272e+01]
 [ 3.45953782e+01  3.76292990e+01  7.65088408e+01  1.25871378e+02
   6.99890190e+00  7.09658260e+00  1.72677241e+02 -3.93651013e+01
   1.33922797e+01  8.55986081e+01  1.52824981e+02  8.84959558e+02
   1.84837067e+02  2.59720019e+00  3.82487026e+00 -5.26406200e-01
  -3.94100885e+00  2.11506008e+00]
 [ 1.02393288e+02  7.65088408e+01  2.47796994e+02  4.01922920e+02
   2.15903409e+01  2.16662162e+01  4.72978160e+02 -1.12064585e+02
   3.63889560e+01  1.76978817e+02  4.24815891e+02  2.42596483e+03
   3.61587476e+02 -2.22357722e+01  7.13424958e+00  3.66615077e+01
   1.41490327e+01  3.89624225e+01]
 [ 1.91108904e+02  1.25871378e+02  4.01922920e+02  1.03348796e+03
   1.17399578e+02  3.07868675e+01  8.21709399e+02 -2.07285358e+02
   6.19248451e+01  2.70407492e+02  7.84363937e+02  4.26113678e+03
   5.76271693e+02 -7.73086142e+01  5.25636549e+00  5.12565641e+01
   8.02696294e+01  1.17599919e+02]
 [ 8.91493278e+00  6.99890190e+00  2.15903409e+01  1.17399578e+02
   3.15109055e+01  1.74143830e+00  3.61890064e+01 -1.30741282e+01
   2.36919580e+00  1.20262188e+01  3.60788591e+01  1.92214092e+02
   2.71472212e+01 -1.11001714e+01 -1.43829088e+00 -1.05090716e+00
   1.38445969e+01  1.73599494e+01]
 [ 8.49722047e+00  7.09658260e+00  2.16662162e+01  3.07868675e+01
   1.74143830e+00  4.26056766e+00  3.36286737e+01 -8.12972429e+00
   2.60705583e+00  1.92569446e+01  2.57123953e+01  1.66709375e+02
   2.66968218e+01 -4.26436135e+00  7.73046848e-01  2.59585659e+00
   1.06408997e+00  6.34409613e+00]
 [ 2.22142552e+02  1.72677241e+02  4.72978160e+02  8.21709399e+02
   3.61890064e+01  3.36286737e+01  1.10208797e+03 -2.51971673e+02
   8.50534153e+01  3.89886258e+02  9.89964349e+02  5.67471671e+03
   8.64233907e+02  2.31321429e+00  9.75270609e+00  6.23747646e+01
   1.14941038e+00  2.93421028e+01]
 [-5.07377062e+01 -3.93651013e+01 -1.12064585e+02 -2.07285358e+02
  -1.30741282e+01 -8.12972429e+00 -2.51971673e+02  6.10255067e+01
  -1.91901297e+01 -8.79775903e+01 -2.29742918e+02 -1.29230745e+03
  -1.94833526e+02  3.77155986e+00 -1.67694826e+00 -1.27354904e+01
  -5.53302279e+00 -1.26045575e+01]
 [ 1.73442164e+01  1.33922797e+01  3.63889560e+01  6.19248451e+01
   2.36919580e+00  2.60705583e+00  8.50534153e+01 -1.91901297e+01
   6.70063228e+00  3.04705093e+01  7.61026536e+01  4.39393168e+02
   6.71194483e+01  4.39133689e-01  8.65624519e-01  4.91311147e+00
  -2.96986865e-01  1.91183152e+00]
 [ 8.08185544e+01  8.55986081e+01  1.76978817e+02  2.70407492e+02
   1.20262188e+01  1.92569446e+01  3.89886258e+02 -8.79775903e+01
   3.04705093e+01  2.10704141e+02  3.37961456e+02  1.99835952e+03
   4.09337523e+02  4.81472645e+00  8.67591450e+00  5.32029600e-01
  -9.28293676e+00  8.28950578e+00]
 [ 1.96740865e+02  1.52824981e+02  4.24815891e+02  7.84363937e+02
   3.60788591e+01  2.57123953e+01  9.89964349e+02 -2.29742918e+02
   7.61026536e+01  3.37961456e+02  9.63123533e+02  5.10650610e+03
   7.92961780e+02  4.93661225e+00  3.50439940e+00  5.43080896e+01
   2.89667212e+00  1.99299537e+01]
 [ 1.15737644e+03  8.84959558e+02  2.42596483e+03  4.26113678e+03
   1.92214092e+02  1.66709375e+02  5.67471671e+03 -1.29230745e+03
   4.39393168e+02  1.99835952e+03  5.10650610e+03  3.03965503e+04
   4.44283031e+03  1.00781799e+01  5.24055059e+01  3.17198233e+02
   1.88320282e+01  1.55166784e+02]
 [ 1.56845875e+02  1.84837067e+02  3.61587476e+02  5.76271693e+02
   2.71472212e+01  2.66968218e+01  8.64233907e+02 -1.94833526e+02
   6.71194483e+01  4.09337523e+02  7.92961780e+02  4.44283031e+03
   1.05926012e+03  4.31521605e+01  2.42551988e+01 -1.60838782e+01
  -4.49422804e+01 -2.85688377e+01]
 [-1.25103809e+01  2.59720019e+00 -2.22357722e+01 -7.73086142e+01
  -1.11001714e+01 -4.26436135e+00  2.31321429e+00  3.77155986e+00
   4.39133689e-01  4.81472645e+00  4.93661225e+00  1.00781799e+01
   4.31521605e+01  3.79314592e+01 -1.62660554e+00 -6.77948158e+00
  -3.15531055e+01 -4.12940374e+01]
 [ 7.42980962e+00  3.82487026e+00  7.13424958e+00  5.25636549e+00
  -1.43829088e+00  7.73046848e-01  9.75270609e+00 -1.67694826e+00
   8.65624519e-01  8.67591450e+00  3.50439940e+00  5.24055059e+01
   2.42551988e+01 -1.62660554e+00  2.09118707e+01 -1.69423252e+00
   2.43810063e+00  2.13012016e+00]
 [ 1.14292109e+01 -5.26406200e-01  3.66615077e+01  5.12565641e+01
  -1.05090716e+00  2.59585659e+00  6.23747646e+01 -1.27354904e+01
   4.91311147e+00  5.32029600e-01  5.43080896e+01  3.17198233e+02
  -1.60838782e+01 -6.77948158e+00 -1.69423252e+00  7.88093779e+01
   4.06743884e+00  1.32505546e+01]
 [ 1.51240421e+01 -3.94100885e+00  1.41490327e+01  8.02696294e+01
   1.38445969e+01  1.06408997e+00  1.14941038e+00 -5.53302279e+00
  -2.96986865e-01 -9.28293676e+00  2.89667212e+00  1.88320282e+01
  -4.49422804e+01 -3.15531055e+01  2.43810063e+00  4.06743884e+00
   3.78501448e+01  4.08492719e+01]
 [ 2.23917272e+01  2.11506008e+00  3.89624225e+01  1.17599919e+02
   1.73599494e+01  6.34409613e+00  2.93421028e+01 -1.26045575e+01
   1.91183152e+00  8.28950578e+00  1.99299537e+01  1.55166784e+02
  -2.85688377e+01 -4.12940374e+01  2.13012016e+00  1.32505546e+01
   4.08492719e+01  5.53357072e+01]]
In [66]:
pca = PCA(n_components=6)
pca.fit(X)
Out[66]:
PCA(copy=True, iterated_power='auto', n_components=6, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)
In [67]:
print(pca.explained_variance_)
[34058.40257798   576.59672368   422.84425672   112.72332963
    88.46477087    57.93632194]
In [68]:
print(pca.components_)
[[ 3.61201326e-02  2.78577494e-02  7.60066421e-02  1.34573101e-01
   6.37248536e-03  5.26082083e-03  1.77019465e-01 -4.04237720e-02
   1.36964042e-02  6.28298081e-02  1.59835491e-01  9.44216679e-01
   1.40158604e-01  1.19126724e-04  1.68662387e-03  9.64527873e-03
   7.18367027e-04  5.05536158e-03]
 [-6.49878462e-02  6.23549009e-02 -9.99873445e-02 -7.14180537e-01
  -1.35918427e-01 -1.01650227e-02  4.20806627e-03  3.37862674e-02
   3.22842588e-03  1.53598573e-01 -4.42407407e-02  3.12620202e-02
   5.41322300e-01  1.86296947e-01  1.85636078e-02 -9.49944502e-02
  -1.90949919e-01 -2.28027323e-01]
 [ 1.90346104e-02  1.14367435e-01  1.10349593e-01  5.18826308e-01
   1.25340241e-01  1.99836567e-02  8.39463371e-02 -4.69288056e-02
   4.30195906e-03  2.34682152e-01  1.78766374e-01 -2.55984397e-01
   7.10236477e-01 -4.11287130e-02  2.92537219e-02 -1.31502686e-01
   2.48738362e-02  4.86483656e-02]
 [-9.76817468e-02 -8.04950643e-02 -4.13718720e-02  1.35371534e-02
  -7.18108998e-02 -6.33104582e-02  2.50591474e-01 -5.74011328e-02
   1.61222936e-02 -2.81557429e-01  7.68488833e-01 -1.41164491e-01
  -7.88859335e-02  2.16729962e-01 -1.02852528e-01  1.98113941e-01
  -1.78522629e-01 -2.86178183e-01]
 [ 1.60651903e-01  7.29621322e-02  5.24453226e-01 -2.48377070e-01
  -2.02310492e-01  1.05046116e-01  2.99527547e-01 -6.16154330e-02
   2.53976178e-02  3.27991295e-01  1.07366725e-01 -1.12919400e-01
  -4.72590514e-02 -2.05911741e-01  5.64412059e-02  4.74090519e-01
   8.93380215e-02  2.71264070e-01]
 [ 2.16753272e-01  3.15569938e-02  9.35460578e-02 -1.57416953e-01
  -1.49491551e-01  2.10194626e-02  1.72591313e-01 -6.81571188e-02
   1.18742194e-02  1.61056900e-01  3.03833861e-01 -5.01106304e-02
  -2.24690344e-01 -1.15429913e-01  3.62776170e-03 -8.01195106e-01
   1.59604073e-01  9.56664566e-02]]
In [69]:
print(pca.explained_variance_ratio_)
[0.96066937 0.01626379 0.01192697 0.00317953 0.00249528 0.00163418]
In [70]:
plt.bar(list(range(1,7)),pca.explained_variance_ratio_,alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()
In [71]:
plt.step(list(range(1,7)),np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()

With 5 variables we can explain over 99.5% of the variation in the original data! With 3 Variable we are able to explain about 98.9% of the variation

In [73]:
pca3 = PCA(n_components=3)
pca3.fit(X)
print(pca3.components_)
print(pca3.explained_variance_ratio_)
Xpca3 = pca3.transform(X)
[[ 3.61201326e-02  2.78577494e-02  7.60066421e-02  1.34573101e-01
   6.37248536e-03  5.26082083e-03  1.77019465e-01 -4.04237720e-02
   1.36964042e-02  6.28298081e-02  1.59835491e-01  9.44216679e-01
   1.40158604e-01  1.19126724e-04  1.68662387e-03  9.64527873e-03
   7.18367027e-04  5.05536158e-03]
 [-6.49878462e-02  6.23549009e-02 -9.99873445e-02 -7.14180537e-01
  -1.35918427e-01 -1.01650227e-02  4.20806627e-03  3.37862674e-02
   3.22842588e-03  1.53598573e-01 -4.42407407e-02  3.12620202e-02
   5.41322300e-01  1.86296947e-01  1.85636078e-02 -9.49944502e-02
  -1.90949919e-01 -2.28027323e-01]
 [ 1.90346104e-02  1.14367435e-01  1.10349593e-01  5.18826308e-01
   1.25340241e-01  1.99836567e-02  8.39463371e-02 -4.69288056e-02
   4.30195906e-03  2.34682152e-01  1.78766374e-01 -2.55984397e-01
   7.10236477e-01 -4.11287130e-02  2.92537219e-02 -1.31502686e-01
   2.48738362e-02  4.86483656e-02]]
[0.96066937 0.01626379 0.01192697]
In [74]:
sns.pairplot(pd.DataFrame(Xpca3))
Out[74]:
<seaborn.axisgrid.PairGrid at 0x1d9908ec240>

Fit SVM Model with PCA

In [78]:
SVM_model = SVC()
SVM_model.fit(Xpca3, Y)
SVM_model.score(Xpca3, Y)
Out[78]:
0.9988179669030733

The model accuracy of SVM is 88.98%. With KFold validation the accuracy of SVM model is increased by 2% to 90.8%. Using PCA, the model accuracy is increased to 99.88%